from bertopic import BERTopic
# Load the saved BERTopic model from disk. Every later cell queries or
# visualizes this `topic_model` object.
topic_model = BERTopic.load("/home/zhhuang/climate_policy_paper/code/model_save/bert_topic_except_ecolex_model")
/home/zhhuang/anaconda3/envs/climatepolicy/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html from .autonotebook import tqdm as notebook_tqdm
import pandas as pd
# Read the corpus spreadsheet and pull out the two columns used below:
# the document texts ("docs") and the year recorded on the same row ("Year").
df = pd.read_excel(
    "/home/zhhuang/climate_policy_paper/code/data/Topic_docs_time_except_ecolex.xlsx"
)
docs = df["docs"].to_list()
timestamp = df["Year"].to_list()
# Select the "Topic" column of the model's topic summary table.
# NOTE(review): if this line shares a notebook cell with the next one, its
# value is discarded -- only the last expression of a cell is displayed.
topic_model.get_topic_info()['Topic']
# Per-document table restricted to the columns of interest: assigned topic,
# topic name, top words, probability and the representative-document flag.
topic_model.get_document_info(docs)[["Topic", "Name", "Top_n_words", "Probability", "Representative_document"]]
| | Topic | Name | Top_n_words | Probability | Representative_document |
|---|---|---|---|---|---|
| 0 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.767967 | False |
| 1 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.936575 | False |
| 2 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.382572 | False |
| 3 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.798235 | False |
| 4 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.356415 | False |
| ... | ... | ... | ... | ... | ... |
| 13585 | 3 | 3_vehicle_passenger_mobility_purchase | vehicle - passenger - mobility - purchase - fr... | 0.146053 | False |
| 13586 | 1 | 1_energy_appliance_vehicle_lamp | energy - appliance - vehicle - lamp - househol... | 0.067121 | False |
| 13587 | 1 | 1_energy_appliance_vehicle_lamp | energy - appliance - vehicle - lamp - househol... | 0.066461 | False |
| 13588 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.733766 | False |
| 13589 | -1 | -1_energy_emission_development_support | energy - emission - development - support - me... | 0.371204 | False |
13590 rows × 5 columns
from collections import Counter

# Corpus-wide frequency of every whitespace-separated token in `docs`.
# Counter is a dict subclass, so `counts` still supports plain dict access.
counts = Counter(word for doc in docs for word in doc.split())

# All (word, count) pairs, most frequent first. most_common() keeps words
# with equal counts in first-seen order, matching the previous stable sort.
items = counts.most_common()

# Print the 100 most frequent words. Slicing (instead of indexing 0..99)
# avoids an IndexError when the corpus has fewer than 100 distinct tokens.
for word, count in items[:100]:
    print("{0:<10}{1:>5}".format(word, count))
energy 25647 emission 5860 renewable 5738 efficiency 5610 development 5520 plan 5103 project 4917 electricity 4797 power 4432 system 4222 sector 4168 gas 4076 national 4020 building 3864 support 3836 policy 3601 measure 3558 fuel 3220 vehicle 3166 include 3109 reduce 3099 target 3064 government 3042 increase 2919 technology 2897 standard 2895 tax 2716 public 2603 production 2566 investment 2552 climate 2550 set 2489 programme 2488 environmental 2486 source 2466 reduction 2451 provide 2398 promote 2392 heat 2386 strategy 2326 consumption 2300 transport 2292 `` 2265 aim 2246 program 2160 carbon 2157 requirement 2149 management 2133 industry 2121 establish 2075 sustainable 2066 resource 2001 action 1988 company 1935 develop 1925 solar 1908 supply 1874 market 1867 implementation 1847 plant 1844 capacity 1844 improve 1821 generation 1804 level 1780 cost 1778 country 1773 electric 1758 implement 1753 fund 1730 economic 1714 scheme 1711 objective 1677 activity 1623 product 1600 achieve 1556 change 1530 air 1514 service 1500 wind 1493 '' 1480 goal 1455 efficient 1454 construction 1449 base 1448 ensure 1443 water 1441 build 1422 framework 1393 greenhouse 1390 price 1381 total 1374 environment 1372 green 1371 require 1353 oil 1327 natural 1312 equipment 1310 grant 1297 installation 1286 report 1285
# Ask the model for the five topics closest to the query "Transport".
similar_topics, similarity = topic_model.find_topics(
    "Transport",
    top_n=5,
)
# Show the (word, weight) pairs of the first returned topic.
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
[('vehicle', 0.08605256916422592),
('passenger', 0.026346936494183622),
('mobility', 0.02305217605568082),
('purchase', 0.019652099518188953),
('freight', 0.014857606931569211),
('hybrid', 0.014231897748857608),
('battery', 0.013699978374273164),
('traffic', 0.013663290669214097),
('truck', 0.013651079201795197),
('railway', 0.013084886660973443)]
# Ask the model for the five topics closest to the query "Industry".
# NOTE(review): the recorded first match is a fuel excise / taxation topic;
# confirm it is the intended counterpart of "Industry" before relying on it.
similar_topics, similarity = topic_model.find_topics(
    "Industry",
    top_n=5,
)
# Show the (word, weight) pairs of the first returned topic.
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
[('excise', 0.037326874703956),
('diesel', 0.02948705047722744),
('petroleum', 0.02876827570041448),
('petrol', 0.020565038389434888),
('mw', 0.02022796339307252),
('taxation', 0.019770074703999285),
('hydropower', 0.019591516906273244),
('geothermal', 0.018945125318320953),
('gasoline', 0.018319656785637222),
('generation', 0.016250492549903513)]
# Ask the model for the five topics closest to the query "Energy systems".
# NOTE(review): the recorded first match is a refrigeration / air-conditioning
# topic, identical to the one recorded for the "Agriculture, Forestry and
# Other Land Use" query -- inspect `similarity` before treating it as a match.
similar_topics, similarity = topic_model.find_topics(
    "Energy systems",
    top_n=5,
)
# Show the (word, weight) pairs of the first returned topic.
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
[('refrigeration', 0.21798700806101987),
('refrigerant', 0.13986385917107486),
('conditioning', 0.10668004911735993),
('partnership', 0.0570654327070873),
('preventative', 0.04393385580956857),
('recuperation', 0.04135448989597232),
('certification', 0.039874170027037185),
('dehumidifiers', 0.03898157629443045),
('regenerative', 0.03737406607839815),
('depleting', 0.03548307787665277)]
# Ask the model for the five topics closest to the query "Buildings".
similar_topics, similarity = topic_model.find_topics(
    "Buildings",
    top_n=5,
)
# Show the (word, weight) pairs of the first returned topic.
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
[('energy', 0.02458405283418319),
('appliance', 0.020051290170197984),
('vehicle', 0.012396776541827042),
('lamp', 0.012010329298167421),
('household', 0.011809807727682293),
('scheme', 0.010853864034571324),
('refrigerator', 0.010808802508147669),
('water', 0.009855005391131003),
('programme', 0.009434125596781176),
('equipment', 0.009407178494799322)]
# Ask the model for the five topics closest to the query
# "Agriculture, Forestry and Other Land Use".
# NOTE(review): the recorded first match is the same refrigeration /
# air-conditioning topic that the "Energy systems" query returned -- inspect
# `similarity` before treating it as a match.
similar_topics, similarity = topic_model.find_topics(
    "Agriculture, Forestry and Other Land Use",
    top_n=5,
)
# Show the (word, weight) pairs of the first returned topic.
closest_topic = similar_topics[0]
topic_model.get_topic(closest_topic)
[('refrigeration', 0.21798700806101987),
('refrigerant', 0.13986385917107486),
('conditioning', 0.10668004911735993),
('partnership', 0.0570654327070873),
('preventative', 0.04393385580956857),
('recuperation', 0.04135448989597232),
('certification', 0.039874170027037185),
('dehumidifiers', 0.03898157629443045),
('regenerative', 0.03737406607839815),
('depleting', 0.03548307787665277)]
# Sanity check: number of documents loaded from the spreadsheet.
len(docs)
13590
import os
# Directory that receives the exported figures.
images_path = "/home/zhhuang/climate_policy_paper/paper_images"
# exist_ok=True creates the directory (and any missing parents) only when it
# is absent, without the race between a separate existence check and creation.
os.makedirs(images_path, exist_ok=True)
import plotly.io as pio

# Make SVG the default format for static image export.
pio.kaleido.scope.default_format = "svg"

# Word bar charts: top_n_topics=20 topics, n_words=10 words each,
# each panel 300x300.
fig = topic_model.visualize_barchart(top_n_topics=20, n_words=10, width=300, height=300)
# Build the target from `images_path` (defined with the directory setup)
# instead of repeating the absolute directory, so the output location is
# configured in one place.
pio.write_image(fig, os.path.join(images_path, "topic_except_ecolex_barchart.svg"))
# Display the figure inline.
fig
# Topic similarity heatmap.
fig2 = topic_model.visualize_heatmap()
# Build the target from `images_path` (defined with the directory setup)
# instead of repeating the absolute directory.
pio.write_image(fig2, os.path.join(images_path, "topic_except_ecolex_heatmap.svg"))
# Display the figure inline.
fig2
# Topic overview plot produced by BERTopic's visualize_topics().
fig3 = topic_model.visualize_topics()
# Build the target from `images_path` (defined with the directory setup)
# instead of repeating the absolute directory.
pio.write_image(fig3, os.path.join(images_path, "topic_except_ecolex_visualize_topics.svg"))
# Display the figure inline.
fig3
# Provenance: the hierarchy table was presumably produced once with the
# commented-out code below and cached as a spreadsheet, so it is not
# recomputed on every run.
# hierarchical_topics = topic_model.hierarchical_topics(docs)
# with pd.ExcelWriter("Topic_hierarchical_topics_except_ecolex.xlsx", engine='xlsxwriter',
#                     engine_kwargs={'options': {'strings_to_urls': False}}) as writer:
#     hierarchical_topics.to_excel(writer)

# Load the cached hierarchy table.
# NOTE(review): the table was written without index=False and is read without
# index_col, so it likely carries an extra index column, and any list-valued
# cells would come back as strings -- confirm visualize_hierarchy accepts it.
hierarchical_topics = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_hierarchical_topics_except_ecolex.xlsx")
fig4 = topic_model.visualize_hierarchy(hierarchical_topics=hierarchical_topics)
# Build the target from `images_path` (defined with the directory setup)
# instead of repeating the absolute directory.
pio.write_image(fig4, os.path.join(images_path, "topic_except_ecolex_hierarchical_topics.svg"))
# Display the figure inline.
fig4
# Normalize the year labels to strings, as required by the "%Y" format passed
# to topics_over_time below. A year of 0 is replaced by "2020" (presumably 0
# marks a missing year -- TODO confirm).
# Comparing via str() matches both the string '0' and an integer 0 read from
# a numeric spreadsheet column; the previous `i == '0'` test only matched
# strings and would have let an integer 0 through as the invalid year "0".
# Slice assignment keeps the update in place, as the original loop did.
timestamp[:] = ['2020' if str(year) == '0' else str(year) for year in timestamp]
# Compute the topics-over-time table from the documents and their year
# labels, using nr_bins=20. The recorded run of this cell took roughly
# four and a half hours.
topics_over_time = topic_model.topics_over_time(
    docs,
    timestamp,
    datetime_format="%Y",
    nr_bins=20,
)

# Cache the result as a spreadsheet; strings_to_urls=False stops xlsxwriter
# from converting URL-like strings into hyperlinks.
# NOTE(review): this path is relative to the working directory, while the
# later read uses an absolute path under .../climate_policy_paper/code/ --
# the two only coincide when the notebook runs from that directory.
excel_options = {'options': {'strings_to_urls': False}}
with pd.ExcelWriter(
    "Topic_except_ecolex_topics_over_time.xlsx",
    engine='xlsxwriter',
    engine_kwargs=excel_options,
) as writer:
    topics_over_time.to_excel(writer)
20it [4:26:28, 799.41s/it]
# Reload the cached topics-over-time table rather than recomputing it.
topics_over_time = pd.read_excel("/home/zhhuang/climate_policy_paper/code/Topic_except_ecolex_topics_over_time.xlsx")
fig5 = topic_model.visualize_topics_over_time(topics_over_time)
# Build the target from `images_path` (defined with the directory setup)
# instead of repeating the absolute directory.
pio.write_image(fig5, os.path.join(images_path, "topic_except_ecolex_visualize_topics_over_time.svg"))
# Display the figure inline.
fig5